#!/bin/bash
#===============================================================================
# Configuration A: HAN first - best suited to multi-NUMA nodes with low clock speed
#===============================================================================

# Baseline collective-component selection:
#   - coll_hcoll_enable=0            : switch the hcoll component off
#   - coll_sm_priority=100           : give the shared-memory collective
#                                      component a priority of 100
#   - coll_tuned_use_dynamic_rules=1 : turn on the tuned component's dynamic
#                                      rules (needed for forced algorithms)
OMPI_MCA_coll_hcoll_enable=0
OMPI_MCA_coll_sm_priority=100
OMPI_MCA_coll_tuned_use_dynamic_rules=1
export OMPI_MCA_coll_hcoll_enable \
  OMPI_MCA_coll_sm_priority \
  OMPI_MCA_coll_tuned_use_dynamic_rules

#-------------------------------------------------------------------------------
# Core: enable HAN (priority 90); the intent is for it to handle every Allreduce
#
# NOTE(review): confirm with `ompi_info --param coll han --level 9` that the
# coll_han_allreduce_algorithm / *_tree_fanout names below are registered by
# the installed Open MPI; presumably an unrecognised name is silently ignored.
#-------------------------------------------------------------------------------
OMPI_MCA_coll_han_priority=90
OMPI_MCA_coll_han_allreduce_algorithm=3         # request algorithm 3 inside HAN
OMPI_MCA_coll_han_allreduce_up_tree_fanout=4    # 4-ary tree over the 16 NUMA representatives
OMPI_MCA_coll_han_allreduce_down_tree_fanout=8
export OMPI_MCA_coll_han_priority \
  OMPI_MCA_coll_han_allreduce_algorithm \
  OMPI_MCA_coll_han_allreduce_up_tree_fanout \
  OMPI_MCA_coll_han_allreduce_down_tree_fanout

# Note: coll_tuned_allreduce_algorithm is deliberately left unset so that HAN
# is the only component steering Allreduce.
# The author expects an automatic fallback when HAN does not support a given
# message size (assumption - TODO confirm against the deployed Open MPI).

#-------------------------------------------------------------------------------
# Other optimizations
#-------------------------------------------------------------------------------
# Force the tuned component's broadcast algorithm number 6. Forced algorithms
# are only honoured while coll_tuned_use_dynamic_rules=1.
# NOTE(review): 6 should be the binomial tree - confirm with
# `ompi_info --param coll tuned --level 9`.
export OMPI_MCA_coll_tuned_bcast_algorithm=6

# Segment size for the forced broadcast: 65536 bytes (64 KiB).
# The tuned component registers this knob as "bcast_algorithm_segmentsize";
# the previously used "bcast_segment_size" spelling is not a registered
# parameter, so the segment size never reached Open MPI.
export OMPI_MCA_coll_tuned_bcast_algorithm_segmentsize=65536

# Legacy spelling retained only for backward compatibility with anything that
# reads this variable from the environment.
export OMPI_MCA_coll_tuned_bcast_segment_size=65536

# MPI runtime behaviour: yield-when-idle switched off (0), leave-pinned
# switched on (1).
export OMPI_MCA_mpi_yield_when_idle=0 OMPI_MCA_mpi_leave_pinned=1

# Shared-memory BTL message limits and process placement.
# NOTE(review): the btl_sm_* and hwloc_base_binding_policy names are
# version-specific - verify them against the deployed release with
# `ompi_info --all`.
OMPI_MCA_btl_sm_eager_limit=32768        # 32 KiB
OMPI_MCA_btl_sm_max_send_size=262144     # 256 KiB
OMPI_MCA_hwloc_base_binding_policy=numa  # bind at NUMA granularity
export OMPI_MCA_btl_sm_eager_limit \
  OMPI_MCA_btl_sm_max_send_size \
  OMPI_MCA_hwloc_base_binding_policy

# Progress-thread setting pinned to 0.
OMPI_MCA_opal_progress_threads=0
export OMPI_MCA_opal_progress_threads